import os
import numpy as np
import librosa

from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection
from lib.conf import tts_dir
from lib.models import default_voice_detection_model

class BackgroundDetector:
    """Detect background (non-speech) content in a wav file via pyannote VAD.

    Runs a voice-activity-detection pipeline over the file and reports the
    fraction of the audio that contains no detected speech.
    """

    def __init__(self, wav_file: str):
        """Load the VAD model and prepare the pipeline for *wav_file*.

        Args:
            wav_file: Path to the audio file to analyze.
        """
        self.wav_file = wav_file
        # Model weights are cached under tts_dir so repeated runs avoid re-download.
        model = Model.from_pretrained(default_voice_detection_model, cache_dir=tts_dir)
        self.pipeline = VoiceActivityDetection(segmentation=model)
        hyper_params = {
            # onset/offset activation thresholds
            "onset": 0.5,
            "offset": 0.5,
            # remove speech regions shorter than that many seconds.
            "min_duration_on": 0.0,
            # fill non-speech regions shorter than that many seconds.
            "min_duration_off": 0.0,
        }
        self.pipeline.instantiate(hyper_params)

    def detect(self, vad_ratio_thresh: float = 0.05) -> tuple[bool, dict[str, float | bool]]:
        """Estimate how much of the audio is non-speech.

        Args:
            vad_ratio_thresh: Minimum non-speech fraction (0..1) required to
                flag the file as containing background content.

        Returns:
            A ``(status, report)`` tuple where ``status`` is True when the
            non-speech ratio exceeds ``vad_ratio_thresh`` and ``report`` holds
            ``non_speech_ratio`` and ``background_detected``.
        """
        diarization = self.pipeline(self.wav_file)
        speech_segments = [(s.start, s.end) for s in diarization.get_timeline()]
        total_duration = librosa.get_duration(path=self.wav_file)
        speech_time = sum(end - start for start, end in speech_segments)
        if total_duration > 0:
            # Clamp to [0, 1]: segment boundaries from the VAD timeline can
            # overshoot the librosa-measured duration by float jitter.
            non_speech_ratio = min(max(1 - (speech_time / total_duration), 0.0), 1.0)
        else:
            # Zero-length/empty audio: avoid ZeroDivisionError; with no audio
            # there is no speech, so treat the file as entirely non-speech.
            non_speech_ratio = 1.0
        status = non_speech_ratio > vad_ratio_thresh
        report = {
            'non_speech_ratio': non_speech_ratio,
            'background_detected': status,
        }
        return status, report